# -*- coding: utf-8 -*-

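# Scheduled crawling on Linux:
#   1. Run `crontab -e`.
#   2. In the editor, add the line: 0 7 * * * /path/to/this/program
#   3. After saving, the crawl runs automatically every day at 07:00.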


import logging
import os
import re
import urllib.parse
import urllib.request

# Front page whose links are crawled, and the directory the pages are saved to.
HOME_URL = 'https://www.csdn.net/'
SAVE_DIR = '/home/coladog/CSDN-crawler/'

# Upper bound (seconds) on any single HTTP request so an unattended cron run
# cannot hang forever on a stalled connection.
TIMEOUT_SECONDS = 30

# A single pattern with two groups yields each title together with its own
# href, instead of running two separate scans and re-pairing them with zip().
LINK_PATTERN = re.compile(r'title="(.*?)" href="(.*?)"')

logger = logging.getLogger(__name__)


def extract_links(page, base_url=HOME_URL):
    """Return ``(file_stem, absolute_url)`` pairs for the links in *page*.

    Args:
        page: HTML text to scan for ``title="..." href="..."`` attribute pairs.
        base_url: URL that relative and protocol-relative hrefs are resolved
            against.

    Returns:
        A list of tuples in document order. ``file_stem`` is the link title
        with every ``/`` replaced by ``|`` so it can be used as a file name.
        When several links share the same stem only the first one is kept.
    """
    seen = set()
    links = []
    for title, href in LINK_PATTERN.findall(page):
        stem = title.replace('/', '|')
        if stem in seen:
            continue
        seen.add(stem)
        links.append((stem, urllib.parse.urljoin(base_url, href)))
    return links


def fetch(address, timeout=TIMEOUT_SECONDS):
    """Return the raw bytes served at *address*.

    The response is closed when the read finishes or fails. Errors raised by
    ``urllib.request.urlopen`` or by reading the response propagate to the
    caller.
    """
    with urllib.request.urlopen(address, timeout=timeout) as response:
        return response.read()


def main():
    """Download every page linked from ``HOME_URL`` into ``SAVE_DIR``.

    A failure to fetch the front page propagates and aborts the run. A failure
    on an individual linked page is logged and the crawl moves on to the next
    link.
    """
    # Without the directory every single save would fail.
    os.makedirs(SAVE_DIR, exist_ok=True)

    page = fetch(HOME_URL).decode('utf-8', errors='replace')

    for stem, link in extract_links(page):
        target = os.path.join(SAVE_DIR, stem + '.html')
        try:
            # Read the whole body before opening the file so a failed
            # transfer does not leave a truncated page on disk.
            body = fetch(link)
            with open(target, 'wb') as out:
                out.write(body)
        except Exception as exc:
            # Best-effort per link: one bad URL or file name must not stop the
            # rest of the crawl, but the failure is reported instead of being
            # silently discarded. KeyboardInterrupt/SystemExit still propagate.
            logger.warning('skipped %r (%s): %s', stem, link, exc)


if __name__ == '__main__':
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s %(levelname)s %(message)s',
    )
    main()